Data source: https://archive.ics.uci.edu/ml/datasets/Beijing+Multi-Site+Air-Quality+Data

The data set includes hourly air pollutants data from 12 nationally-controlled air-quality monitoring sites. The air-quality data are from the Beijing Municipal Environmental Monitoring Center. The meteorological data in each air-quality site are matched with the nearest weather station from the China Meteorological Administration. The time period is from March 1st, 2013 to February 28th, 2017. Missing data are denoted as NA.

Attribute Information:

No: row number

year: year of data in this row

month: month of data in this row

day: day of data in this row

hour: hour of data in this row

PM2.5: PM2.5 concentration (ug/m^3)

PM10: PM10 concentration (ug/m^3)

SO2: SO2 concentration (ug/m^3)

NO2: NO2 concentration (ug/m^3)

CO: CO concentration (ug/m^3)

O3: O3 concentration (ug/m^3)

TEMP: temperature (degree Celsius)

PRES: pressure (hPa)

DEWP: dew point temperature (degree Celsius)

RAIN: precipitation (mm)

wd: wind direction

WSPM: wind speed (m/s)

station: name of the air-quality monitoring site

library(corrplot)
## corrplot 0.92 loaded
library(RColorBrewer)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(seasonal)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(dplyr)
library(TSstudio)
library(xts)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
## 
##     first, last
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(tseries)
library(ggplot2)
data <- read.csv("PRSA_Data_Wanshouxigong_20130301-20170228.csv", header = TRUE)

We are going to study the air pollution condition in Beijing, Wanshou xigong.

attach(data)

sum(is.na(data))
## [1] 5146
column_nane_list <- colnames(data)

for (a in column_nane_list){
  
  print(a)
  data <-data %>% fill(a,.direction = 'updown')
}
## [1] "No"
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(a)
## 
##   # Now:
##   data %>% select(all_of(a))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## [1] "year"
## [1] "month"
## [1] "day"
## [1] "hour"
## [1] "PM2.5"
## [1] "PM10"
## [1] "SO2"
## [1] "NO2"
## [1] "CO"
## [1] "O3"
## [1] "TEMP"
## [1] "PRES"
## [1] "DEWP"
## [1] "RAIN"
## [1] "wd"
## [1] "WSPM"
## [1] "station"
sum( apply(data,2, function(x) is.na(x)))
## [1] 0

We fill the missing value by the previous value.If the previous value is not available, we will fill by the next value.

There are two type of information in the dataset. One is the diffrent index of the air pollutants (e.g. PM2.5, PM10, SO2, NO2, CO, O3). And the other type is the weather condition (e.g. TEMP, PRES, DEWP, RAIN, wd, WSPM).

grouped_year = aggregate(cbind(PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,WSPM) ~ data$year, data = data, FUN = mean, na.rm = TRUE)

grouped_month = aggregate(cbind(PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES, DEWP,RAIN,WSPM) ~ data$year+ data$month, data = data, FUN = mean, na.rm = TRUE)

grouped_day = aggregate(cbind(PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES, DEWP,RAIN,WSPM) ~ data$year+ data$month + data$day, data = data, FUN = mean, na.rm = TRUE)


grouped_year <- grouped_year[
  order( grouped_year[,1] ),
]

grouped_month <- grouped_month[
  order( grouped_month[,1], grouped_month[,2] ),
]

grouped_day <- grouped_day[
  order( grouped_day[,1], grouped_day[,2], grouped_day[,3] ),
]
corr_month <- cor(grouped_month[,-c(1:2)])

corrplot(corr_month, type="upper", order="hclust",
         col=brewer.pal(n=8, name="RdYlBu"))

We have found that the air pollutants have high correlation between each other (e.g. PM2.5, PM10, CO, SO2 and NO2 has positive correlation, but O3 has the negativity correlation with other air pollutants.)

And we also have found that the weather conditions also have high correlation with the air pollutants (e.g temperature, precipitation has negativity correlation with air pollutant(PM2.5,PM10, CO, SO2, NO2))

plot data by time

grouped_month  <- grouped_month [,-c(1:2)]

ts_by_month <- ts(grouped_month , start = c(2013, 3), frequency = 12)
plot.ts(ts_by_month[,1:6])

plot.ts(ts_by_month[,7:11])

grouped_day <- grouped_day [,-c(1:3)]

dates_by_day <- seq(as.Date("2013-03-01"),length = nrow(grouped_day), by ="day")

ts_by_day <- xts(grouped_day,order.by = dates_by_day, frequency = "daily")

ts_heatmap(ts_by_day[,'PM2.5'],title = " Heatmap - the PM2.5 concentration in Wanshou xigong")
dates_by_month <- seq(as.Date("2013-03-01"),length = nrow(grouped_month), by ="month")



xts_by_month <- xts(grouped_month, order.by = dates_by_month , frequency = 12)



# 
# plot.xts(xts_by_month[,1:5], multi.panel = TRUE)
# 
# plot.xts(xts_by_month[,11])


# ts_by_day <- xts(grouped_day, order.by = dates, frequency = 12)
# cycle(ts_by_month)

boxplot(ts_by_month [,'PM2.5'] ~ cycle(ts_by_month [,'PM2.5']))

boxplot(ts_by_month [,'PM10'] ~ cycle(ts_by_month [,'PM10']))

boxplot(ts_by_month [,'SO2'] ~ cycle(ts_by_month [,'SO2']))

boxplot(ts_by_month [,'NO2'] ~ cycle(ts_by_month [,'NO2']))

boxplot(ts_by_month [,'CO'] ~ cycle(ts_by_month [,'CO']))

boxplot(ts_by_month [,'O3'] ~ cycle(ts_by_month [,'O3']))

boxplot(ts_by_month [,'TEMP'] ~ cycle(ts_by_month [,'TEMP']))

boxplot(ts_by_month [,'DEWP'] ~ cycle(ts_by_month [,'DEWP']))

boxplot(ts_by_month [,'PRES'] ~ cycle(ts_by_month [,'PRES']))

boxplot(ts_by_month [,'RAIN'] ~ cycle(ts_by_month [,'RAIN']))

boxplot(ts_by_month [,'WSPM'] ~ cycle(ts_by_month [,'WSPM']))

We can observe the distribution of difference variable among month

fit decomposition models for PM2.5

# additive decomposition
add_decomp_PM25 <- decompose(ts_by_month[,'PM2.5'], type = 'additive')
add_decomp<- decompose(ts_by_month, type = 'additive')
add_decomp_PM25
## $x
##            Jan       Feb       Mar       Apr       May       Jun       Jul
## 2013                     106.69892  78.82778  81.71640 105.67153  68.14651
## 2014 113.49194 156.06399  98.66667  86.64444  59.18804  58.03889  85.52151
## 2015 103.28199 102.95833  87.32258  71.85556  55.34543  59.44583  61.57124
## 2016  75.26210  48.35201  95.18952  67.05417  55.25806  64.08889  74.10753
## 2017 132.08333  77.61905                                                  
##            Aug       Sep       Oct       Nov       Dec
## 2013  59.41532  61.74306  98.97849  84.30972  92.30914
## 2014  64.41331  70.99903 114.25806 103.31708  67.94247
## 2015  45.68548  53.15000  77.78226 125.42639 165.81989
## 2016  50.55242  59.25139  83.71640 106.37361 156.83468
## 2017                                                  
## 
## $seasonal
##             Jan        Feb        Mar        Apr        May        Jun
## 2013                        12.596727  -5.698226 -24.380240 -21.655511
## 2014  16.017806  21.170881  12.596727  -5.698226 -24.380240 -21.655511
## 2015  16.017806  21.170881  12.596727  -5.698226 -24.380240 -21.655511
## 2016  16.017806  21.170881  12.596727  -5.698226 -24.380240 -21.655511
## 2017  16.017806  21.170881                                            
##             Jul        Aug        Sep        Oct        Nov        Dec
## 2013  -9.601027 -28.952748 -21.657639  13.707981  21.583772  26.868223
## 2014  -9.601027 -28.952748 -21.657639  13.707981  21.583772  26.868223
## 2015  -9.601027 -28.952748 -21.657639  13.707981  21.583772  26.868223
## 2016  -9.601027 -28.952748 -21.657639  13.707981  21.583772  26.868223
## 2017                                                                  
## 
## $trend
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 2013                         NA       NA       NA       NA       NA       NA
## 2014 87.14031 88.07252 88.66643 89.68875 91.11737 90.89406 89.45337 86.81522
## 2015 81.22384 79.44558 77.92155 75.65801 75.05941 80.05886 82.96959 79.52683
## 2016 78.40901 79.13415 79.59116 80.09264 79.54603 78.37778 80.37095 83.95796
## 2017       NA       NA                                                      
##           Sep      Oct      Nov      Dec
## 2013 91.94639 91.93741 91.32442 88.40104
## 2014 84.12981 83.04094 82.26463 82.16314
## 2015 77.57935 77.70708 77.50339 77.69321
## 2016       NA       NA       NA       NA
## 2017                                    
## 
## $random
##               Jan          Feb          Mar          Apr          May
## 2013                                     NA           NA           NA
## 2014  10.33382068  46.82059124  -2.59649089   2.65392485  -7.54908995
## 2015   6.04034552   2.34186903  -3.19569409   1.89576779   4.66626068
## 2016 -19.16472204 -51.95301612   3.00162914  -7.34024848   0.09227343
## 2017           NA           NA                                       
##               Jun          Jul          Aug          Sep          Oct
## 2013           NA           NA           NA  -8.54569396  -6.66689181
## 2014 -11.19966299   5.66916168   6.55083331   8.52685202  17.50914281
## 2015   1.04248834 -11.79732274  -4.88859599  -2.77171390 -13.63280684
## 2016   7.36661881   3.33760522  -4.45279316           NA           NA
## 2017                                                                 
##               Nov          Dec
## 2013 -28.59846862 -22.96012658
## 2014  -0.53131766 -41.08889239
## 2015  26.33923044  61.25846313
## 2016           NA           NA
## 2017                          
## 
## $figure
##  [1]  12.596727  -5.698226 -24.380240 -21.655511  -9.601027 -28.952748
##  [7] -21.657639  13.707981  21.583772  26.868223  16.017806  21.170881
## 
## $type
## [1] "additive"
## 
## attr(,"class")
## [1] "decomposed.ts"
plot(add_decomp_PM25)

# multiplicative decomposition
multi_decomp_PM25 <- decompose(ts_by_month[,'PM2.5'], type = 'multiplicative')
multi_decomp <- decompose(ts_by_month, type = 'multiplicative')
multi_decomp_PM25
## $x
##            Jan       Feb       Mar       Apr       May       Jun       Jul
## 2013                     106.69892  78.82778  81.71640 105.67153  68.14651
## 2014 113.49194 156.06399  98.66667  86.64444  59.18804  58.03889  85.52151
## 2015 103.28199 102.95833  87.32258  71.85556  55.34543  59.44583  61.57124
## 2016  75.26210  48.35201  95.18952  67.05417  55.25806  64.08889  74.10753
## 2017 132.08333  77.61905                                                  
##            Aug       Sep       Oct       Nov       Dec
## 2013  59.41532  61.74306  98.97849  84.30972  92.30914
## 2014  64.41331  70.99903 114.25806 103.31708  67.94247
## 2015  45.68548  53.15000  77.78226 125.42639 165.81989
## 2016  50.55242  59.25139  83.71640 106.37361 156.83468
## 2017                                                  
## 
## $seasonal
##            Jan       Feb       Mar       Apr       May       Jun       Jul
## 2013                     1.1536745 0.9261272 0.7002633 0.7396729 0.8814532
## 2014 1.1888050 1.2376261 1.1536745 0.9261272 0.7002633 0.7396729 0.8814532
## 2015 1.1888050 1.2376261 1.1536745 0.9261272 0.7002633 0.7396729 0.8814532
## 2016 1.1888050 1.2376261 1.1536745 0.9261272 0.7002633 0.7396729 0.8814532
## 2017 1.1888050 1.2376261                                                  
##            Aug       Sep       Oct       Nov       Dec
## 2013 0.6454082 0.7402740 1.1617698 1.2774791 1.3474468
## 2014 0.6454082 0.7402740 1.1617698 1.2774791 1.3474468
## 2015 0.6454082 0.7402740 1.1617698 1.2774791 1.3474468
## 2016 0.6454082 0.7402740 1.1617698 1.2774791 1.3474468
## 2017                                                  
## 
## $trend
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 2013                         NA       NA       NA       NA       NA       NA
## 2014 87.14031 88.07252 88.66643 89.68875 91.11737 90.89406 89.45337 86.81522
## 2015 81.22384 79.44558 77.92155 75.65801 75.05941 80.05886 82.96959 79.52683
## 2016 78.40901 79.13415 79.59116 80.09264 79.54603 78.37778 80.37095 83.95796
## 2017       NA       NA                                                      
##           Sep      Oct      Nov      Dec
## 2013 91.94639 91.93741 91.32442 88.40104
## 2014 84.12981 83.04094 82.26463 82.16314
## 2015 77.57935 77.70708 77.50339 77.69321
## 2016       NA       NA       NA       NA
## 2017                                    
## 
## $random
##            Jan       Feb       Mar       Apr       May       Jun       Jul
## 2013                            NA        NA        NA        NA        NA
## 2014 1.0955578 1.4317686 0.9645571 1.0431149 0.9276228 0.8632645 1.0846244
## 2015 1.0696223 1.0471341 0.9713723 1.0254979 1.0529682 1.0038581 0.8418984
## 2016 0.8074204 0.4936978 1.0366711 0.9039877 0.9920094 1.1054779 1.0460777
## 2017        NA        NA                                                  
##            Aug       Sep       Oct       Nov       Dec
## 2013        NA 0.9071121 0.9266773 0.7226649 0.7749536
## 2014 1.1495959 1.1400135 1.1843349 0.9831170 0.6136951
## 2015 0.8900821 0.9254749 0.8615884 1.2668186 1.5839518
## 2016 0.9329225        NA        NA        NA        NA
## 2017                                                  
## 
## $figure
##  [1] 1.1536745 0.9261272 0.7002633 0.7396729 0.8814532 0.6454082 0.7402740
##  [8] 1.1617698 1.2774791 1.3474468 1.1888050 1.2376261
## 
## $type
## [1] "multiplicative"
## 
## attr(,"class")
## [1] "decomposed.ts"
plot(multi_decomp_PM25)

# 
# plot(add_decomp$x[,'PM2.5'])
# plot(add_decomp$seasonal)
# # plot(add_decomp$trend)
# # plot(add_decomp$random)
# 

add_decomp_PM25$trend
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 2013                         NA       NA       NA       NA       NA       NA
## 2014 87.14031 88.07252 88.66643 89.68875 91.11737 90.89406 89.45337 86.81522
## 2015 81.22384 79.44558 77.92155 75.65801 75.05941 80.05886 82.96959 79.52683
## 2016 78.40901 79.13415 79.59116 80.09264 79.54603 78.37778 80.37095 83.95796
## 2017       NA       NA                                                      
##           Sep      Oct      Nov      Dec
## 2013 91.94639 91.93741 91.32442 88.40104
## 2014 84.12981 83.04094 82.26463 82.16314
## 2015 77.57935 77.70708 77.50339 77.69321
## 2016       NA       NA       NA       NA
## 2017
add_decomp$trend[,1]
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 2013                         NA       NA       NA       NA       NA       NA
## 2014 87.14031 88.07252 88.66643 89.68875 91.11737 90.89406 89.45337 86.81522
## 2015 81.22384 79.44558 77.92155 75.65801 75.05941 80.05886 82.96959 79.52683
## 2016 78.40901 79.13415 79.59116 80.09264 79.54603 78.37778 80.37095 83.95796
## 2017       NA       NA                                                      
##           Sep      Oct      Nov      Dec
## 2013 91.94639 91.93741 91.32442 88.40104
## 2014 84.12981 83.04094 82.26463 82.16314
## 2015 77.57935 77.70708 77.50339 77.69321
## 2016       NA       NA       NA       NA
## 2017
add_decomp_PM25$random
##               Jan          Feb          Mar          Apr          May
## 2013                                     NA           NA           NA
## 2014  10.33382068  46.82059124  -2.59649089   2.65392485  -7.54908995
## 2015   6.04034552   2.34186903  -3.19569409   1.89576779   4.66626068
## 2016 -19.16472204 -51.95301612   3.00162914  -7.34024848   0.09227343
## 2017           NA           NA                                       
##               Jun          Jul          Aug          Sep          Oct
## 2013           NA           NA           NA  -8.54569396  -6.66689181
## 2014 -11.19966299   5.66916168   6.55083331   8.52685202  17.50914281
## 2015   1.04248834 -11.79732274  -4.88859599  -2.77171390 -13.63280684
## 2016   7.36661881   3.33760522  -4.45279316           NA           NA
## 2017                                                                 
##               Nov          Dec
## 2013 -28.59846862 -22.96012658
## 2014  -0.53131766 -41.08889239
## 2015  26.33923044  61.25846313
## 2016           NA           NA
## 2017
add_decomp$random[,1]
##              Jan         Feb         Mar         Apr         May         Jun
## 2013                                  NA          NA          NA          NA
## 2014  -28.960982   38.124721    6.001469   31.715562   14.891982    6.638761
## 2015  -33.254457   -6.354002    5.402266   30.957405   27.107332   18.880913
## 2016  -58.459524  -60.648887   11.599589   21.721389   22.533345   25.205043
## 2017          NA          NA                                                
##              Jul         Aug         Sep         Oct         Nov         Dec
## 2013          NA          NA   -5.349203   15.764394  -47.676030  -90.936947
## 2014   29.924432   13.773747   11.723343   39.940429  -19.608879 -109.065713
## 2015   12.457948    2.334318    0.424777    8.798479    7.261669   -6.718357
## 2016   27.592876    2.770121          NA          NA          NA          NA
## 2017
# ts_by_month[,'PM2.5']

Moving average smoothing

STL.PM2.5<- stl( ts_by_month[,'PM2.5'] , t.window = 13, s.window = 'periodic')
plot(STL.PM2.5)

STL.PM10<- stl( ts_by_month[,'PM10'] , t.window = 13, s.window = 'periodic')
plot(STL.PM10)

STL.NO2<- stl( ts_by_month[,'NO2'] , t.window = 13, s.window = 'periodic')
plot(STL.NO2)

STL.SO2<- stl( ts_by_month[,'NO2'] , t.window = 13, s.window = 'periodic')
plot(STL.SO2)

STL.CO<- stl( ts_by_month[,'CO'] , t.window = 13, s.window = 'periodic')
plot(STL.CO)

STL.O3<- stl( ts_by_month[,'O3'] , t.window = 13, s.window = 'periodic')
plot(STL.O3)

STL.TEMP<- stl( ts_by_month[,'TEMP'] , t.window = 13, s.window = 'periodic')
plot(STL.TEMP)

STL.PRES<- stl( ts_by_month[,'PRES'] , t.window = 13, s.window = 'periodic')
plot(STL.PRES)

STL.RAIN<- stl( ts_by_month[,'RAIN'] , t.window = 13, s.window = 'periodic')
plot(STL.RAIN)

auto.arima(remainder(STL.PM2.5))
## Series: remainder(STL.PM2.5) 
## ARIMA(0,0,0) with zero mean 
## 
## sigma^2 = 357.8:  log likelihood = -209.23
## AIC=420.46   AICc=420.55   BIC=422.33
x <- lapply(1:25, function(i){
  p <- Box.test(remainder(STL.PM2.5), lag = i, type = "Ljung-Box")
  output <- data.frame(lag = i, p_value = p$p.value)
 return(output) }) %>% bind_rows

plot(x = x$lag,
      y = x$p_value, ylim = c(0,1),
      main = "Series white_noise (PM2.5) - Ljung-Box Test",
      xlab = "Lag", ylab = "P-Value")
 
abline(h = 0.05, col="red", lwd=3, lty=2)

auto.arima(remainder(STL.PM10))
## Series: remainder(STL.PM10) 
## ARIMA(2,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ma1
##       0.5851  -0.6317  -0.8254
## s.e.  0.1170   0.1137   0.0924
## 
## sigma^2 = 180:  log likelihood = -192.29
## AIC=392.57   AICc=393.5   BIC=400.06
x <- lapply(1:25, function(i){
  p <- Box.test(remainder(STL.PM10), lag = i, type = "Ljung-Box")
  output <- data.frame(lag = i, p_value = p$p.value)
 return(output) }) %>% bind_rows

plot(x = x$lag,
      y = x$p_value, ylim = c(0,1),
      main = "Series white_noise(PM10) - Ljung-Box Test",
      xlab = "Lag", ylab = "P-Value")
 
abline(h = 0.05, col="red", lwd=3, lty=2)

auto.arima(remainder(STL.SO2))
## Series: remainder(STL.SO2) 
## ARIMA(0,0,0) with zero mean 
## 
## sigma^2 = 32.73:  log likelihood = -151.83
## AIC=305.65   AICc=305.74   BIC=307.52
x <- lapply(1:25, function(i){
  p <- Box.test(remainder(STL.SO2), lag = i, type = "Ljung-Box")
  output <- data.frame(lag = i, p_value = p$p.value)
 return(output) }) %>% bind_rows

plot(x = x$lag,
      y = x$p_value, ylim = c(0,1),
      main = "Series white_noise(SO2) - Ljung-Box Test",
      xlab = "Lag", ylab = "P-Value")
 
abline(h = 0.05, col="red", lwd=3, lty=2)

auto.arima(remainder(STL.NO2))
## Series: remainder(STL.NO2) 
## ARIMA(0,0,0) with zero mean 
## 
## sigma^2 = 32.73:  log likelihood = -151.83
## AIC=305.65   AICc=305.74   BIC=307.52
x <- lapply(1:25, function(i){
  p <- Box.test(remainder(STL.NO2), lag = i, type = "Ljung-Box")
  output <- data.frame(lag = i, p_value = p$p.value)
 return(output) }) %>% bind_rows

plot(x = x$lag,
      y = x$p_value, ylim = c(0,1),
      main = "Series white_noise(NO2) - Ljung-Box Test",
      xlab = "Lag", ylab = "P-Value")
 
abline(h = 0.05, col="red", lwd=3, lty=2)

auto.arima(remainder(STL.CO))
## Series: remainder(STL.CO) 
## ARIMA(0,0,0) with zero mean 
## 
## sigma^2 = 53698:  log likelihood = -329.5
## AIC=660.99   AICc=661.08   BIC=662.86
x <- lapply(1:25, function(i){
  p <- Box.test(remainder(STL.CO), lag = i, type = "Ljung-Box")
  output <- data.frame(lag = i, p_value = p$p.value)
 return(output) }) %>% bind_rows

plot(x = x$lag,
      y = x$p_value, ylim = c(0,1),
      main = "Series white_noise(CO) - Ljung-Box Test",
      xlab = "Lag", ylab = "P-Value")
 
abline(h = 0.05, col="red", lwd=3, lty=2)

auto.arima(remainder(STL.O3))
## Series: remainder(STL.O3) 
## ARIMA(0,0,0) with zero mean 
## 
## sigma^2 = 51.6:  log likelihood = -162.76
## AIC=327.51   AICc=327.6   BIC=329.38
x <- lapply(1:25, function(i){
  p <- Box.test(remainder(STL.O3), lag = i, type = "Ljung-Box")
  output <- data.frame(lag = i, p_value = p$p.value)
 return(output) }) %>% bind_rows

plot(x = x$lag,
      y = x$p_value, ylim = c(0,1),
      main = "Series white_noise(O3) - Ljung-Box Test",
      xlab = "Lag", ylab = "P-Value")
 
abline(h = 0.05, col="red", lwd=3, lty=2)

auto.arima(remainder(STL.TEMP))
## Series: remainder(STL.TEMP) 
## ARIMA(1,0,1) with zero mean 
## 
## Coefficients:
##          ar1      ma1
##       0.3815  -0.8530
## s.e.  0.2102   0.1376
## 
## sigma^2 = 0.5325:  log likelihood = -52.3
## AIC=110.6   AICc=111.14   BIC=116.21
x <- lapply(1:25, function(i){
  p <- Box.test(remainder(STL.TEMP), lag = i, type = "Ljung-Box")
  output <- data.frame(lag = i, p_value = p$p.value)
 return(output) }) %>% bind_rows

plot(x = x$lag,
      y = x$p_value, ylim = c(0,1),
      main = "Series white_noise(TEMP) - Ljung-Box Test",
      xlab = "Lag", ylab = "P-Value")
 
abline(h = 0.05, col="red", lwd=3, lty=2)

auto.arima(remainder(STL.PRES))
## Series: remainder(STL.PRES) 
## ARIMA(0,0,0) with zero mean 
## 
## sigma^2 = 1.942:  log likelihood = -84.04
## AIC=170.09   AICc=170.17   BIC=171.96
x <- lapply(1:25, function(i){
  p <- Box.test(remainder(STL.PRES), lag = i, type = "Ljung-Box")
  output <- data.frame(lag = i, p_value = p$p.value)
 return(output) }) %>% bind_rows

plot(x = x$lag,
      y = x$p_value, ylim = c(0,1),
      main = "Series white_noise(PRES) - Ljung-Box Test",
      xlab = "Lag", ylab = "P-Value")
 
abline(h = 0.05, col="red", lwd=3, lty=2)

auto.arima(remainder(STL.RAIN))
## Series: remainder(STL.RAIN) 
## ARIMA(2,0,0) with zero mean 
## 
## Coefficients:
##           ar1      ar2
##       -0.5866  -0.2711
## s.e.   0.1379   0.1360
## 
## sigma^2 = 0.001314:  log likelihood = 91.96
## AIC=-177.91   AICc=-177.37   BIC=-172.3
x <- lapply(1:25, function(i){
  p <- Box.test(remainder(STL.RAIN), lag = i, type = "Ljung-Box")
  output <- data.frame(lag = i, p_value = p$p.value)
 return(output) }) %>% bind_rows

plot(x = x$lag,
      y = x$p_value, ylim = c(0,1),
      main = "Series white_noise(RAIN) - Ljung-Box Test",
      xlab = "Lag", ylab = "P-Value")
 
abline(h = 0.05, col="red", lwd=3, lty=2)

forcest

forecast(STL.PM2.5, h=5, method = 'naive')
##          Point Forecast      Lo 80    Hi 80     Lo 95    Hi 95
## Mar 2017       77.18490  44.111134 110.2587  26.60294 127.7669
## Apr 2017       56.62231   9.848936 103.3957 -14.91139 128.1560
## May 2017       43.71513 -13.570317 101.0006 -43.89540 131.3257
## Jun 2017       53.04516 -13.102381 119.1927 -48.11877 154.2091
## Jul 2017       53.96631 -19.988884 127.9215 -59.13840 167.0710
STL.PM2.5 %>% forecast(method="naive", h=5) %>%
   autoplot(ylab = 'PM2.5') + ggtitle('Forecasting')

ts_heatmap(ts_by_month[,'PM2.5'],title = " Heatmap - the PM2.5 concentration in Wanshou xigong")
forecast(STL.PM10, h=5, method = 'naive')
##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## Mar 2017      120.79043 88.517348 153.0635  71.43301 170.1479
## Apr 2017       93.15092 47.509882 138.7919  23.34898 162.9529
## May 2017       84.11630 28.217673 140.0149  -1.37327 169.6059
## Jun 2017       67.14942  2.603248 131.6956 -31.56543 165.8643
## Jul 2017       62.57884 -9.585969 134.7437 -47.78771 172.9454
STL.PM10 %>% forecast(method="naive", h=5) %>%
   autoplot(ylab = 'PM10') + ggtitle('Forecasting')

ts_heatmap(ts_by_month[,'PM10'],title = " Heatmap - the PM10 concentration in Wanshou xigong")
forecast(STL.SO2, h=5, method = 'naive')
##          Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
## Mar 2017       61.73933 51.45773 72.02093 46.014975 77.46368
## Apr 2017       48.03199 33.49161 62.57237 25.794394 70.26958
## May 2017       45.55294 27.74468 63.36120 18.317562 72.78832
## Jun 2017       43.79880 23.23559 64.36200 12.350091 75.24750
## Jul 2017       40.59675 17.60639 63.58712  5.436031 75.75748
STL.SO2 %>% forecast(method="naive", h=5) %>%
   autoplot(ylab = 'SO2') + ggtitle('Forecasting')

ts_heatmap(ts_by_month[,'SO2'],title = " Heatmap - the SO2 concentration in Wanshou xigong")
forecast(STL.NO2, h=5, method = 'naive')
##          Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
## Mar 2017       61.73933 51.45773 72.02093 46.014975 77.46368
## Apr 2017       48.03199 33.49161 62.57237 25.794394 70.26958
## May 2017       45.55294 27.74468 63.36120 18.317562 72.78832
## Jun 2017       43.79880 23.23559 64.36200 12.350091 75.24750
## Jul 2017       40.59675 17.60639 63.58712  5.436031 75.75748
STL.NO2 %>% forecast(method="naive", h=5) %>%
   autoplot(ylab = 'NO2') + ggtitle('Forecasting')

ts_heatmap(ts_by_month[,'NO2'],title = " Heatmap - the NO2 concentration in Wanshou xigong")
forecast(STL.CO, h=5, method = 'naive')
##          Point Forecast      Lo 80    Hi 80     Lo 95    Hi 95
## Mar 2017       973.1480  572.96264 1373.333  361.1173 1585.179
## Apr 2017       541.0066  -24.94091 1106.954 -324.5354 1406.549
## May 2017       493.1916 -199.94972 1186.333 -566.8766 1553.260
## Jun 2017       705.9207  -94.44995 1506.291 -518.1406 1929.982
## Jul 2017       653.5837 -241.25789 1548.425 -714.9584 2022.126
STL.CO %>% forecast(method="naive", h=5) %>%
   autoplot(ylab = 'CO') + ggtitle('Forecasting')

ts_heatmap(ts_by_month[,'CO'],title = " Heatmap - the CO concentration in Wanshou xigong")
forecast(STL.O3, h=5, method = 'naive')
##          Point Forecast    Lo 80     Hi 80    Lo 95     Hi 95
## Mar 2017       57.51380 45.54433  69.48328 39.20807  75.81954
## Apr 2017       74.45243 57.52503  91.37983 48.56420 100.34065
## May 2017       96.37771 75.64596 117.10945 64.67124 128.08418
## Jun 2017      104.09568 80.15672 128.03464 67.48420 140.70716
## Jul 2017      100.94909 74.18452 127.71366 60.01621 141.88196
STL.O3 %>% forecast(method="naive", h=5) %>%
   autoplot(ylab = 'O3') + ggtitle('Forecasting')

ts_heatmap(ts_by_month[,'O3'],title = " Heatmap - the O3 concentration in Wanshou xigong")
forecast(STL.TEMP, h=5, method = 'naive')
##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## Mar 2017       10.39841  8.668394 12.12842  7.752579 13.04424
## Apr 2017       17.13377 14.687157 19.58038 13.391999 20.87554
## May 2017       23.26267 20.266196 26.25914 18.679959 27.84538
## Jun 2017       26.24717 22.787142 29.70720 20.955513 31.53883
## Jul 2017       28.83308 24.964650 32.70151 22.916826 34.74934
STL.TEMP %>% forecast(method="naive", h=5) %>%
   autoplot(ylab = 'TEMP') + ggtitle('Forecasting')

ts_heatmap(ts_by_month[,'TEMP'],title = " Heatmap - the TEMP concentration in Wanshou xigong")
forecast(STL.PRES, h=5, method = 'naive')
##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## Mar 2017       1016.220 1013.4819 1018.959 1012.0323 1020.408
## Apr 2017       1010.481 1006.6084 1014.354 1004.5584 1016.404
## May 2017       1004.456  999.7135 1009.199  997.2028 1011.710
## Jun 2017       1001.340  995.8635 1006.817  992.9643 1009.716
## Jul 2017       1000.111  993.9874 1006.234  990.7461 1009.475
STL.PRES %>% forecast(method="naive", h=5) %>%
   autoplot(ylab = 'PRES') + ggtitle('Forecasting')

ts_heatmap(ts_by_month[,'PRES'],title = " Heatmap - the PRES concentration in Wanshou xigong")
forecast(STL.RAIN, h=5, method = 'naive')
##          Point Forecast       Lo 80      Hi 80       Lo 95     Hi 95
## Mar 2017    0.001549438 -0.09152873 0.09462761 -0.14080134 0.1439002
## Apr 2017    0.016914377 -0.11471803 0.14854679 -0.18440002 0.2182288
## May 2017    0.041943287 -0.11927283 0.20315941 -0.20461549 0.2885021
## Jun 2017    0.126730572 -0.05942577 0.31288691 -0.15797098 0.4114321
## Jul 2017    0.250805358  0.04267624 0.45893448 -0.06750066 0.5691114
STL.RAIN %>% forecast(method="naive", h=5) %>%
   autoplot(ylab = 'RAIN') + ggtitle('Forecasting')

ts_heatmap(ts_by_month[,'RAIN'],title = " Heatmap - the RAIN concentration in Wanshou xigong")
convert_date_to_weekday <- function(day, month, year){

  date <- as.POSIXlt(paste(as.character(year) , as.character(month), as.character(day), sep="-"), tz = "UTC")
  
  
  weekday <- weekdays(date)

  return (weekday)
  
}
  

convert_date_to_weekday(25,11,2022)
## [1] "Friday"
data_weekday <- data

data_weekday$weekday <- mapply(convert_date_to_weekday, data$day, data$month, data$year)

grouped_weekday_year = aggregate(cbind(PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES, DEWP,RAIN,WSPM) ~ year + weekday , data = data_weekday, FUN = mean, na.rm = TRUE)

grouped_weekday_year$weekday <- factor(grouped_weekday_year$weekday, levels = c("Monday", "Tuesday","Wednesday","Thursday", "Friday","Saturday","Sunday"))


weekdayorder = c("Monday", "Tuesday","Wednesday","Thursday", "Friday","Saturday","Sunday")

grouped_weekday_year = grouped_weekday_year[order(match(grouped_weekday_year$weekday,weekdayorder)),]







library(lattice)





xyplot( PM2.5 ~ weekday  ,
       type="b",
       group=year,
       data = grouped_weekday_year,
       auto.key = TRUE,
)

xyplot( CO ~ weekday  ,
       type="b",
       group=year,
       data = grouped_weekday_year,
       auto.key = TRUE,
)

xyplot( TEMP ~ weekday  ,
       type="b",
       group=year,
       data = grouped_weekday_year,
       auto.key = TRUE,
)

xyplot( NO2 ~ weekday  ,
       type="b",
       group=year,
       data = grouped_weekday_year,
       auto.key = TRUE,
)

grouped_hour_year = aggregate(cbind(PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES, DEWP,RAIN,WSPM) ~  year + hour , data = data, FUN = mean, na.rm = TRUE)

grouped_hour = aggregate(cbind(PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES, DEWP,RAIN,WSPM) ~   hour , data = data, FUN = mean, na.rm = TRUE)

grouped_hour_median = aggregate(cbind(PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES, DEWP,RAIN,WSPM) ~   hour , data = data, FUN = median, na.rm = TRUE)

library(lattice)


xyplot( PM2.5 ~ hour  ,
       type="b",
       group=(year),
       data = grouped_hour_year,
       auto.key = TRUE,
)

xyplot( PM2.5 ~ hour  ,
       type="b",
       data = grouped_hour,
       auto.key = TRUE,
)

xyplot( PM2.5 ~ hour  ,
       type="b",
       data = grouped_hour_median,
       auto.key = TRUE,
)

xyplot( TEMP ~ hour  ,
       type="b",
       group=(year),
       data = grouped_hour_year,
       auto.key = TRUE,
)

ts_by_hour <- ts(data, frequency = 24)

ts.plot(ts_by_hour)

time_index <- seq(from = as.POSIXct("2013-03-01 00:00"), 
                  to = as.POSIXct("2017-2-28 23:00"), by = "hour")



xts_by_hour <- xts(data[, c("PM2.5","PM10","SO2","NO2","CO","O3","TEMP","PRES", "DEWP","RAIN","WSPM")], order.by = time_index, frequency = 24)

attr(xts_by_hour, 'frequency')<- 24

plot.xts(xts_by_hour[,"PM2.5"])

boxplot(xts_by_hour [,'PM2.5'] ~ cycle(xts_by_hour[,'PM2.5']))

decpmpose_hours <-decompose(as.ts(xts_by_hour[,'PM2.5']))

plot(decpmpose_hours)